---
title: "W3:Sequence models"
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: right
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
Week 3: Sequence models

In the last couple of weeks you looked first at tokenizing words to get numeric values for them, and then at using embeddings to group words of similar meaning depending on how they were labelled. This gave you a good, but rough, sentiment analysis: words such as 'fun' and 'entertaining' might show up in a positive movie review, while 'boring' and 'dull' might show up in a negative one. But sentiment can also be determined by the sequence in which words appear. For example, 'not fun' is of course the opposite of 'fun'. This week you'll start digging into a variety of model formats that are used to train models to understand context in sequences!
```{python}
import tensorflow as tf
```
```{python}
import sys
# Show the Python version used to run this notebook
print(sys.version)
```
```{python}
import os
# Show the installed TensorFlow package version
os.system('pip3 show tensorflow')
```
```{python}
import os
# Show the installed Keras package version
os.system('pip3 show keras')
```
# LSTMs
## download data
The `imdb_reviews/subwords8k` dataset contains positive/negative IMDB movie reviews, pre-tokenized with a vocabulary of roughly 8k subwords.
```{python}
import tensorflow_datasets as tfds
# Download the subword encoded pretokenized dataset
dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True, as_supervised=True)
# Get the tokenizer
tokenizer = info.features['text'].encoder
```
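To see what the subword encoder actually does, you can round-trip a short string through it. This is a quick sanity check rather than part of the original notebook, and the sample sentence is arbitrary.

```{python}
# Round-trip an arbitrary sentence through the subword encoder
sample = 'This movie was not fun at all'
ids = tokenizer.encode(sample)
print(ids)

# Decode each ID on its own to see the subword pieces it maps to
print([tokenizer.decode([i]) for i in ids])

# Decoding the full ID list recovers the original string
print(tokenizer.decode(ids))
```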
## make training and testing data
```{python}
BUFFER_SIZE = 10000
BATCH_SIZE = 256
# Get the train and test splits
train_data, test_data = dataset['train'], dataset['test']
# Shuffle the training data
train_dataset = train_data.shuffle(BUFFER_SIZE)
# Batch the datasets, padding each batch to the length of its longest sequence
train_dataset = train_dataset.padded_batch(BATCH_SIZE)
test_dataset = test_data.padded_batch(BATCH_SIZE)
```
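Because `padded_batch` pads each batch only to the length of its longest review, different batches can have different sequence lengths. As a quick check (assuming the pipeline above), you can inspect the shapes of one batch:

```{python}
# Peek at one batch: shape is (batch_size, longest_review_in_this_batch)
for reviews, labels in train_dataset.take(1):
    print('reviews:', reviews.shape, reviews.dtype)
    print('labels: ', labels.shape, labels.dtype)
```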
## define model
```{python}
import tensorflow as tf
# Hyperparameters
embedding_dim = 32
lstm1_dim = 32
lstm2_dim = 16
dense_dim = 16

# Build the model: stacked bidirectional LSTMs over subword embeddings
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
    # return_sequences=True so the second LSTM receives the full sequence
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm1_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm2_dim)),
    tf.keras.layers.Dense(dense_dim, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
```
```{python}
# Print the model summary
model.summary()
```
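Note in the summary that each `Bidirectional` wrapper doubles its LSTM's output features, because the forward and backward passes are concatenated: the first LSTM emits `2 * lstm1_dim = 64` features per timestep and the second `2 * lstm2_dim = 32`. As a minimal sketch (the random batch of IDs below is made up purely for illustration), you can confirm the model maps a batch of reviews to one sigmoid score each:

```{python}
import numpy as np

# A made-up batch of 4 'reviews', each 10 random subword IDs long
dummy = np.random.randint(1, tokenizer.vocab_size, size=(4, 10))
print(model(dummy).shape)  # (4, 1): one sigmoid score per review
```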
## compile model
```{python}
# Set the training parameters
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
```
## training model
```{python}
NUM_EPOCHS = 3
history = model.fit(train_dataset, epochs=NUM_EPOCHS, validation_data=test_dataset)
```
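Before plotting the curves, you can also get a single held-out score with `evaluate`; this step is not in the original notebook, but it reuses the same `test_dataset`.

```{python}
# Report final loss and accuracy on the test split
loss, acc = model.evaluate(test_dataset)
print(f'test loss: {loss:.4f}, test accuracy: {acc:.4f}')
```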
## model result
```{python}
import matplotlib.pyplot as plt
# Plot utility
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()
# Plot the accuracy and results
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
```
# save model
```{python}
# Save the entire model as a `.keras` zip archive
model.save('c3week3_movie_review_model.keras')
```
# load model
```{python}
new_model = tf.keras.models.load_model('c3week3_movie_review_model.keras')
```
```{python}
new_model.summary()
```
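To confirm the reloaded model works end to end, here is a minimal inference sketch on a made-up review; the sentence and the 0.5 decision threshold are assumptions for illustration, not part of the course code.

```{python}
# Score a made-up review with the reloaded model
sample_review = 'This movie was dull and boring, not fun at all'
encoded = tf.constant([tokenizer.encode(sample_review)])  # batch of one
score = float(new_model.predict(encoded)[0][0])
print(f'score: {score:.3f} ->', 'positive' if score >= 0.5 else 'negative')
```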
# resources
- <https://www.coursera.org/learn/natural-language-processing-tensorflow>
- <https://github.com/https-deeplearning-ai/tensorflow-1-public/tree/main/C3>